import scrapy
import json
from scrapy import cmdline


# Spider that scrapes a JetBrains activation-code page served through an
# IPFS gateway and writes each activation code into its own .txt file.
class IpfsSpider(scrapy.Spider):
    # Spider name, used by `scrapy crawl ipfs`.
    name = "ipfs"
    # Domain(s) the spider is allowed to visit.
    allowed_domains = ["ipfs.infura-ipfs.io"]
    # Entry URL. Alternative gateways for the same content hash:
    # start_urls = ["https://ipfs.infura-ipfs.io/ipfs/bafybeih65no5dklpqfe346wyeiak6wzemv5d7z2ya7nssdgwdz4xrmdu6i/"]
    # start_urls = ["https://ipfs.eth.aragon.network/ipfs/bafybeih65no5dklpqfe346wyeiak6wzemv5d7z2ya7nssdgwdz4xrmdu6i/"]
    start_urls = ["https://ipfs.io/ipfs/bafybeih65no5dklpqfe346wyeiak6wzemv5d7z2ya7nssdgwdz4xrmdu6i/"]
    # Output directory for the activation-code files.
    # NOTE(review): the directory must already exist — open() does not create it.
    codeDir = 'D:/file/JetBrains/激活码_python爬取2'

    def parse(self, response):
        """Parse the page and dump activation codes to text files.

        The page embeds its data as a JS object literal inside a <script>
        tag:  ``let jbKeys = {"<sequence>": {"<key>": "<code>", ...}, ...};``
        Each product card (<article class="card">) carries a
        ``data-sequence`` attribute keying into that object.  For every
        (card title, key) pair the matching code is written to
        ``<codeDir>/<title>_<key>.txt``.

        :param response: scrapy Response for one of ``start_urls``.
        """
        print("解析完成：")

        # Markers delimiting the embedded JSON object inside the script text.
        start_flag = 'let jbKeys = '
        end_flag = '}};'

        jbKeys = {}
        for script in response.xpath("//script"):
            text = script.extract()
            start_num = text.find(start_flag)
            end_num = text.find(end_flag)
            # Skip scripts that do not contain the jbKeys assignment.
            if start_num == -1 or end_num == -1:
                continue

            # Slice out the object literal: keep '}}' but drop the trailing ';'.
            jbKeysStr = text[start_num + len(start_flag):end_num + len(end_flag) - 1]
            jbKeys = json.loads(jbKeysStr)

        # Translation table used to strip characters that are illegal or
        # awkward in file names (replaces the old chained .replace() calls).
        unsafe_chars = str.maketrans('', '', '|/:')

        # Walk every product card on the page.
        for article in response.xpath("//article[@class='card']"):
            # Key into the jbKeys mapping for this card.
            data_sequence = article.attrib["data-sequence"]
            for div in article.xpath("./div[@class='pd-6 overflow-hidden bg-card container radius-1']"):
                h1 = div.xpath('./h1[@class="truncate truncate-1 color-primary mt-0 overflow-ellipsis"]')
                # Human-readable product title from the heading's title attribute.
                title = h1.attrib['title']

                codeMap = jbKeys.get(data_sequence)
                # Guard: a card may have no matching entry in jbKeys; the
                # original code crashed here with AttributeError on None.
                if not codeMap:
                    continue
                for key, val in codeMap.items():
                    codeTitle = title + '_' + key
                    print("激活码标题：", codeTitle)
                    # Sanitize the title for use as a file name.
                    safeTitle = codeTitle.translate(unsafe_chars)
                    # `with` guarantees the handle is closed even on error;
                    # explicit UTF-8 avoids the platform-default encoding
                    # (e.g. GBK on Windows) mangling non-ASCII content.
                    with open(self.codeDir + '/' + safeTitle + '.txt', 'w', encoding='utf-8') as f:
                        f.write(val)

        print("==========================解析完成===================================>")


# Script entry point: launch this spider exactly as the command line
# `scrapy crawl ipfs` would ("ipfs" is the spider name declared above).
if __name__ == '__main__':
    cmdline.execute(["scrapy", "crawl", "ipfs"])